{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimum 2019 value (inclusive) a row must have to be kept as a candidate country.\n",
    "MIN_POPULATION_2019 = 1_000_000\n",
    "\n",
    "# Load the World Bank table, skipping the first 3 rows of the file, and keep only the\n",
    "# country code and the 2019 column (presumably population counts, judging by the file\n",
    "# name -- TODO confirm). Rows with a missing value in either column are dropped.\n",
    "# NOTE(review): the path is an absolute, machine-specific location.\n",
    "pop_df = (\n",
    "    pd.read_csv(r'C:\\Users\\Yasaman\\Downloads\\World_bank_population.csv', skiprows=3)\n",
    "    .loc[:, ['Country Code', '2019']]\n",
    "    .dropna()\n",
    "    # Explicit 64-bit integers: plain `int` is only 32 bits wide on Windows with\n",
    "    # NumPy < 2.0, so any value above 2,147,483,647 would overflow.\n",
    "    .astype({'2019': 'int64'})\n",
    ")\n",
    "\n",
    "# Lower-cased codes of every row whose 2019 value reaches the threshold.\n",
    "possible_countries = [\n",
    "    code.lower()\n",
    "    for code in pop_df.loc[pop_df['2019'] >= MIN_POPULATION_2019, 'Country Code']\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "excluded_iso3_codes = [\n",
    "    \"IRL\",  # Ireland\n",
    "    \"SSD\",  # South Sudan\n",
    "    \"SDN\",  # Sudan\n",
    "    \"COG\",  # Republic of the Congo\n",
    "    \"COD\",  # Democratic Republic of the Congo\n",
    "    \"GIN\",  # Guinea\n",
    "    \"GNB\",  # Guinea-Bissau\n",
    "    \"GNQ\",  # Equatorial Guinea\n",
    "    \"PNG\",  # Papua New Guinea\n",
    "    \"XKX\",  # Kosovo (unofficial)\n",
    "    \"MNE\",  # Montenegro\n",
    "    \"SRB\",  # Serbia\n",
    "    \"TLS\"   # Timor-Leste\n",
    "]\n",
    "excluded_iso3_codes=[c.lower() for c in excluded_iso3_codes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Candidate country codes minus the excluded ones.\n",
    "# Sorting gives the list a stable order across kernel restarts; the iteration order of a\n",
    "# set of strings is not guaranteed to be the same from one interpreter run to the next.\n",
    "possible_iso = sorted(set(possible_countries) - set(excluded_iso3_codes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the working table in one chain:\n",
    "#   1. read the fractional-counting CSV,\n",
    "#   2. keep rows whose `country` is one of the retained ISO codes,\n",
    "#   3. rename the columns to the names used in the rest of the notebook,\n",
    "#   4. keep rows whose `year` is in 2002..2019 (range end 2020 is exclusive).\n",
    "df = (\n",
    "    pd.read_csv(r\"C:\\Users\\Yasaman\\Downloads\\Attention-fractional counting.csv\")\n",
    "    .loc[lambda table: table['country'].isin(possible_iso)]\n",
    "    .rename(\n",
    "        columns={\n",
    "            'aggregated_value': 'count',\n",
    "            'country': 'Mention_country',\n",
    "            'affiliation_country': 'Aff_country',\n",
    "        }\n",
    "    )\n",
    "    .loc[lambda table: table['year'].isin(range(2002, 2020))]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>year</th>\n",
       "      <th>subjarea</th>\n",
       "      <th>Mention_country</th>\n",
       "      <th>Aff_country</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2002</td>\n",
       "      <td>BUSI</td>\n",
       "      <td>usa</td>\n",
       "      <td>usa</td>\n",
       "      <td>387.287541</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>2002</td>\n",
       "      <td>ENVI</td>\n",
       "      <td>aus</td>\n",
       "      <td>aus</td>\n",
       "      <td>309.899627</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>2002</td>\n",
       "      <td>ENVI</td>\n",
       "      <td>ind</td>\n",
       "      <td>gbr</td>\n",
       "      <td>7.051852</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>2002</td>\n",
       "      <td>SOCI</td>\n",
       "      <td>esp</td>\n",
       "      <td>usa</td>\n",
       "      <td>11.219084</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>2002</td>\n",
       "      <td>MEDI</td>\n",
       "      <td>bel</td>\n",
       "      <td>bel</td>\n",
       "      <td>104.618141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1772434</th>\n",
       "      <td>2011</td>\n",
       "      <td>SOCI</td>\n",
       "      <td>vnm</td>\n",
       "      <td>che</td>\n",
       "      <td>0.125000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1772435</th>\n",
       "      <td>2010</td>\n",
       "      <td>AGRI</td>\n",
       "      <td>ago</td>\n",
       "      <td>fin</td>\n",
       "      <td>0.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1772437</th>\n",
       "      <td>2012</td>\n",
       "      <td>EART</td>\n",
       "      <td>nor</td>\n",
       "      <td>irn</td>\n",
       "      <td>0.125000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1772439</th>\n",
       "      <td>2016</td>\n",
       "      <td>ECON</td>\n",
       "      <td>pan</td>\n",
       "      <td>gbr</td>\n",
       "      <td>0.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1772445</th>\n",
       "      <td>2018</td>\n",
       "      <td>ENGI</td>\n",
       "      <td>mdg</td>\n",
       "      <td>fin</td>\n",
       "      <td>0.027778</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1060351 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         year subjarea Mention_country Aff_country       count\n",
       "9        2002     BUSI             usa         usa  387.287541\n",
       "10       2002     ENVI             aus         aus  309.899627\n",
       "11       2002     ENVI             ind         gbr    7.051852\n",
       "12       2002     SOCI             esp         usa   11.219084\n",
       "13       2002     MEDI             bel         bel  104.618141\n",
       "...       ...      ...             ...         ...         ...\n",
       "1772434  2011     SOCI             vnm         che    0.125000\n",
       "1772435  2010     AGRI             ago         fin    0.333333\n",
       "1772437  2012     EART             nor         irn    0.125000\n",
       "1772439  2016     ECON             pan         gbr    0.500000\n",
       "1772445  2018     ENGI             mdg         fin    0.027778\n",
       "\n",
       "[1060351 rows x 5 columns]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the filtered table; pandas shows only the first and last rows of a large frame.\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of included countries:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "148"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Report how many distinct mention countries remain after filtering.\n",
    "print('Number of included countries:')\n",
    "df['Mention_country'].nunique()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
